Setup dependancies
library(dplyr)
library(tibble)
library("tidyr")
library(ggplot2)
require(scales)
Loading required package: scales
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
library(remotes)
#remotes::install_github("Public-Health-Scotland/phsstyles",
## upgrade = "never"
#)git
library(phsstyles)
Vaccinations that include the time, for those who have had a serology sample taken
df_vacc <- readRDS("/conf/EAVE/GPanalysis/data/temp/sero_vacc_time.rds") %>% as_tibble()
nrow(df_vacc)
[1] 78473
Load serology Primary Care and print the number of rows of data there are. Also, extract the IgG quantative result as a number.
df_serology_pc <- readRDS("/conf/EAVE/GPanalysis/data/serology_primcare_march22.rds") %>%
as_tibble() %>%
dplyr::mutate(IgG = readr::parse_number(test_result_quant))
nrow(df_serology_pc)
[1] 59091
See the column names…
colnames(df_serology_pc)
[1] "EAVE_LINKNO" "LabSpecimenNo" "Sampledate_iso" "year" "isoweek" "sex"
[7] "age" "healthboard" "SpecimenOrigin" "DateCollected" "EcossDateReceived" "SpecimenDate"
[13] "SpecimenTypeRaw" "SpecimenType" "SpecLevel2Group" "SpecLevel1Group" "OrganismRaw" "Type"
[19] "OriginalOrganism" "test_result_qual" "test_result_quant" "QNum" "Text" "IgG"
Make a quick histogram plot of the number of samples collected over time
qplot(df_serology_pc$Sampledate_iso, geom="histogram", fill=I("red"), xlab='Sample Date', ylab='Number of Collected Samples', bins=30)
Count how many times each person has had a sample taken, and then count how many occurrences there are of people with nsamples
df_serology_pc %>% dplyr::group_by(EAVE_LINKNO) %>% dplyr::summarise(nsamples = dplyr::n()) %>%
dplyr::group_by(nsamples) %>% dplyr::summarise(noccurrences = dplyr::n())
Filter the dataframe to get the number of rows where there has been a positive test result
df_serology_pc_pos <- df_serology_pc %>% dplyr::filter(test_result_qual == "Positive")
nrow(df_serology_pc_pos)
[1] 20439
df_serology_bd <- readRDS("/conf/EAVE/GPanalysis/data/serology_snbts_march22.rds") %>%
as_tibble() %>%
dplyr::mutate(IgG = readr::parse_number(test_result_quant))
1 parsing failure.
row col expected actual
24758 -- a number SKIP
nrow(df_serology_bd)
[1] 39772
library(phsstyles)
Plot the number of samples collected over time for the blood donors
qplot(df_serology_bd$Sampledate_iso,
geom="histogram",
xlab='Sample Date', ylab='Number of Collected Samples', bins=40) + scale_colour_discrete_phs(palette = "main")
Again, filter the dataframe to get the number of rows where there has been a positive test result:
df_serology_bd_pos <- df_serology_bd %>% dplyr::filter(test_result_qual == "Positive")
nrow(df_serology_bd_pos)
[1] 18478
Check the number of repeat measurements of a person..
df_serology_bd %>% dplyr::group_by(EAVE_LINKNO) %>% dplyr::summarise(nsamples = dplyr::n()) %>%
dplyr::group_by(nsamples) %>% dplyr::summarise(noccurrences = dplyr::n())
Here we load the QCOVID dataframe and filter it on those EAVE studies that are present in the serology datasets.
df_qcovid <- readRDS("/conf/EAVE/GPanalysis/data/cleaned_data/QCOVID_feb22.rds") %>%
as_tibble()
df_qcovid_pc <- df_qcovid %>% dplyr::filter(EAVE_LINKNO %in% df_serology_pc$EAVE_LINKNO)
nrow(df_qcovid_pc)
[1] 46489
df_qcovid_bd <- df_qcovid %>% dplyr::filter(EAVE_LINKNO %in% df_serology_bd$EAVE_LINKNO)
nrow(df_qcovid_bd)
[1] 23210
df_ana_pc <- df_serology_pc %>% dplyr::left_join(df_qcovid_pc) %>%
dplyr::select(c("EAVE_LINKNO","Sampledate_iso","test_result_qual","IgG","n_risk_gps")) %>%
dplyr::mutate(n_risk_gps = ifelse(is.na(n_risk_gps), "0", levels(n_risk_gps)[n_risk_gps]))
Joining, by = "EAVE_LINKNO"
df_ana_pc_pos <- df_ana_pc %>% dplyr::filter(test_result_qual == "Positive")
nrow(df_ana_pc_pos)
[1] 20439
df_ana_pc %>% group_by(n_risk_gps) %>% summarise(counts=n())
Plot a histogram showing the linking of the serology data (primary care) with QCOVID
colnames(df_ana_pc)
[1] "EAVE_LINKNO" "Sampledate_iso" "test_result_qual" "IgG" "n_risk_gps"
colors <- c("Negative" = phs_colours(c("phs-blue")),"Equivocal" = phs_colours(c("phs-magenta")),"Positive" = phs_colours(c("phs-purple")))
colors
Negative Equivocal Positive
"#0078D4" "#9B4393" "#3F3685"
n <- nrow(df_ana_pc)
colors <- c("Negative" = phs_colours(c("phs-blue")),"Equivocal" = phs_colours(c("phs-magenta")),"Positive" = phs_colours(c("phs-purple")))
p <- ggplot(df_ana_pc, aes(x=IgG)) +
geom_histogram(position = "stack", bins=30, data=subset(df_ana_pc,test_result_qual == 'Negative'), aes(fill='Negative')) +
geom_histogram(position = "stack", bins=30, data=subset(df_ana_pc,test_result_qual == 'Positive'), aes(fill='Positive')) +
geom_histogram(position = "stack", bins=30, data=subset(df_ana_pc,test_result_qual == 'Equivocal'), aes(fill='Equivocal')) +
labs(title='Primary Care',x="Antibody Levels [IgG]", y="Number of Samples", fill="Result") +
scale_fill_manual(values = colors) +
theme(aspect.ratio = 0.5) +
scale_y_log10() +
scale_x_log10()
p
n <- nrow(df_ana_pc)
p <- ggplot(df_ana_pc, aes(x=IgG, fill=test_result_qual)) +
geom_histogram(position = "stack", bins=30) +
labs(title='Primary Care',x="Antibody Levels [IgG]", y="Number of Samples") +
theme(aspect.ratio = 0.5) +
scale_y_log10() +
scale_x_log10()
p
bw <- 50
n <- nrow(df_ana_pc)
p <- ggplot(df_ana_pc, aes(IgG)) +
geom_histogram(bins=20) +
labs(x="Primary Care IgG", y="Number of Samples", fill="Number of Risks (QCOVID)") +
scale_fill_discrete_phs(palette = "main") +
theme(aspect.ratio = 0.5)
p
df_ana <- df_ana_pc %>% left_join(df_vacc %>% filter(serology_source=='primary_care_serology'))
Joining, by = "EAVE_LINKNO"
Measurements taken after 1st Vac, before 2nd Vac
df_ana_v1 <- df_ana %>% filter(Sampledate_iso > d1_datetime & Sampledate_iso < d2_datetime )
library(lubridate)
# create breaks
breaks <- hour(hm("00:00", "6:00", "12:00", "18:00", "23:59"))
# labels for the breaks
labels <- c("Night", "Morning", "Afternoon", "Evening")
df_ana_v1$Time_of_day <- cut(x=hour(df_ana_v1$d1_datetime), breaks = breaks, labels = labels, include.lowest=TRUE)
Calculate the number of days the measurement was taken after the first vaccine
df_ana_v1 <- df_ana_v1 %>% mutate(days_passed = as.numeric(Sampledate_iso - d1_datetime,units='days'))
ggplot(df_ana_v1, aes(x=days_passed)) +
geom_histogram()
ggplot(df_ana_v1, aes(x=IgG)) +
geom_histogram()
ggplot(df_ana_v1, aes(x=days_passed, y=mean(IgG))) +
geom_boxplot(width=500)
bin_size <- 20
df_ana_v1 %>%
mutate(bin_dist = factor(days_passed%/%bin_size)) %>%
ggplot(aes(x = bin_dist, y = IgG)) +
geom_boxplot()
df_ana_v1 %>% select(c("Sampledate_iso","IgG","n_risk_gps","d1_product","d1_datetime","Time_of_day","days_passed")) %>% write.csv('test.csv')
Plot a histogram showing the linking of the serology data with QCOVID
df_ana_bd <- df_serology_bd %>% dplyr::left_join(df_qcovid_bd) %>%
dplyr::select(c("EAVE_LINKNO","Sampledate_iso","test_result_qual","IgG","n_risk_gps")) %>%
dplyr::mutate(n_risk_gps = ifelse(is.na(n_risk_gps), "0", levels(n_risk_gps)[n_risk_gps]))
Joining, by = "EAVE_LINKNO"
df_ana_bd_pos <- df_ana_bd %>% dplyr::filter(test_result_qual == "Positive")
min(df_ana_bd_pos$IgG)
[1] 1.1
max(df_ana_bd_pos$IgG)
[1] 11
Number of risk groups in the dataset
df_ana_bd %>% group_by(n_risk_gps) %>% summarise(n=n())
n <- nrow(df_ana_bd)
colors <- c("Negative" = phs_colours(c("phs-blue")),"Equivocal" = phs_colours(c("phs-magenta")),"Positive" = phs_colours(c("phs-purple")))
p <- ggplot(df_ana_bd, aes(x=IgG)) +
geom_histogram(position = "stack", bins=40, data=subset(df_ana_bd,test_result_qual == 'Negative'), aes(fill='Negative')) +
geom_histogram(position = "stack", bins=40, data=subset(df_ana_bd,test_result_qual == 'Positive'), aes(fill='Positive')) +
geom_histogram(position = "stack", bins=40, data=subset(df_ana_bd,test_result_qual == 'Equivocal'), aes(fill='Equivocal')) +
labs(title='Blood Donors',x="Antibody Levels [IgG]", y="Number of Samples", fill="Result") +
scale_fill_manual(values = colors) +
theme(aspect.ratio = 0.5) +
scale_y_log10()
p
All data
bw <- 0.5
n <- nrow(df_ana_bd_pos)
p <- ggplot(df_ana_bd_pos, aes(x=IgG, color=n_risk_gps)) +
stat_bin(geom="step",bins=10,position='identity', size=2, alpha=0.8) +
xlim(0,10) +
labs(title='Blood Donors',x="IgG", y="Number of Samples", color="Number of Risks (QCOVID)") +
scale_colour_discrete_phs(palette = "all") +
scale_y_continuous(trans = "log10") +
theme(aspect.ratio = 0.5)
p
Positive samples only…
df_ana_bd_pos %>% group_by(n_risk_gps) %>% summarise(n=n())
bw <- 100
n <- nrow(df_ana_pc_pos)
p <- ggplot(df_ana_pc_pos, aes(x=IgG, color=n_risk_gps)) +
stat_bin(geom="step",bins=10,position='identity', size=2, alpha=0.8) +
xlim(0,2000) +
labs(title='Primary Care',x="IgG", y="Number of Samples", color="Number of Risks (QCOVID)") +
scale_colour_discrete_phs(palette = "all") +
scale_y_continuous(trans = "log10") +
theme(aspect.ratio = 0.5)
p
bw <- 0.5
n <- nrow(df_ana_bd_pos)
p <- ggplot(df_ana_bd_pos, aes(x=IgG, fill=n_risk_gps)) +
geom_histogram(binwidth = bw) + xlim(0,11+bw) +
labs(x="Blood Donors IgG", y="Number of Samples", fill="Number of Risks (QCOVID)") +
scale_fill_discrete_phs(palette = "main") +
scale_y_log10() +
theme(aspect.ratio = 0.5)
p
df_ana_bd
df <- readRDS("/conf/EAVE/GPanalysis/data/serology_primcare_march22.rds") %>% dplyr::left_join(df_qcovid_pc) %>% mutate(age_group = cut(age,right=FALSE, breaks = c(0,10,20,30,40,50,60,70,80,90,1000),
label=c("0-9",
"10-19",
"20-29",
"30-39",
"40-49",
"50-59",
"60-69",
"70-79",
"80-89",
"90+")))
Joining, by = "EAVE_LINKNO"
df
max_prop <- 30 # choose the highest proportion you want to show
step <- 5 # choose the space you want beween labels
## this part defines vector using the above numbers with axis breaks
breaks <- c(
seq(max_prop/100 * -1, 0 - step/100, step/100),
0,
seq(0 + step / 100, max_prop/100, step/100)
)
## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)
## this part defines vector using the above numbers with axis labels
labels <- c(
seq(max_prop, step, -step),
0,
seq(step, max_prop, step)
)
p1 <- df %>%
## make sure the variables are factors
mutate(age_group = factor(n_risk_gps),
sex = factor(sex)) %>%
age_pyramid(
age_group = "age_group",
split_by = "sex",
proportion = TRUE) +
## only show the x axis label (otherwise repeated in all three plots)
labs(title = "Primary Care",
x = "N Risk Groups",
y = "% of Sample") +
## make the x axis the same for all plots
scale_y_continuous(breaks = breaks,
limits = limits,
labels = labels)
9454 missing rows were removed (9454 values from `age_group` and 0 values from `sex`).Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
p1
df <- readRDS("/conf/EAVE/GPanalysis/data/serology_snbts_march22.rds") %>% dplyr::left_join(df_qcovid_bd)
Joining, by = "EAVE_LINKNO"
df
max_prop <- 40 # choose the highest proportion you want to show
step <- 5 # choose the space you want beween labels
## this part defines vector using the above numbers with axis breaks
breaks <- c(
seq(max_prop/100 * -1, 0 - step/100, step/100),
0,
seq(0 + step / 100, max_prop/100, step/100)
)
## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)
## this part defines vector using the above numbers with axis labels
labels <- c(
seq(max_prop, step, -step),
0,
seq(step, max_prop, step)
)
p2 <- df %>%
## make sure the variables are factors
mutate(age_group = factor(n_risk_gps),
sex = factor(sex)) %>%
age_pyramid(
age_group = "age_group",
split_by = "sex",
proportion = TRUE) +
## only show the x axis label (otherwise repeated in all three plots)
labs(title = "Blood Donors",
x = "N Risk Groups",
y = "% of Sample") +
## make the x axis the same for all plots
scale_y_continuous(breaks = breaks,
limits = limits,
labels = labels)
10775 missing rows were removed (10775 values from `age_group` and 0 values from `sex`).Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
p2
df <- readRDS("/conf/EAVE/GPanalysis/data/EAVE_demographics_SK.rds") %>% left_join(df_qcovid)
Joining, by = c("EAVE_LINKNO", "Sex")
df
max_prop <- 30 # choose the highest proportion you want to show
step <- 5 # choose the space you want beween labels
## this part defines vector using the above numbers with axis breaks
breaks <- c(
seq(max_prop/100 * -1, 0 - step/100, step/100),
0,
seq(0 + step / 100, max_prop/100, step/100)
)
## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)
## this part defines vector using the above numbers with axis labels
labels <- c(
seq(max_prop, step, -step),
0,
seq(step, max_prop, step)
)
p3 <- df %>%
## make sure the variables are factors
mutate(age_group = factor(n_risk_gps),
sex = factor(Sex)) %>%
age_pyramid(
age_group = "age_group",
split_by = "sex",
proportion = TRUE) +
## only show the x axis label (otherwise repeated in all three plots)
labs(title = "EAVE-II",
x = "N Risk Groups",
y = "% of Sample") +
## make the x axis the same for all plots
scale_y_continuous(breaks = breaks,
limits = limits,
labels = labels)
1760162 missing rows were removed (1760162 values from `age_group` and 0 values from `sex`).Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
p3
grid.arrange(p3,p1,p2,ncol=3,nrow=1)
max_prop <- 40 # choose the highest proportion you want to show
step <- 5 # choose the space you want beween labels
## this part defines vector using the above numbers with axis breaks
breaks <- c(
seq(max_prop/100 * -1, 0 - step/100, step/100),
0,
seq(0 + step / 100, max_prop/100, step/100)
)
## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)
## this part defines vector using the above numbers with axis labels
labels <- c(
seq(max_prop, step, -step),
0,
seq(step, max_prop, step)
)
p2 <- df %>%
## make sure the variables are factors
mutate(age_group = factor(age_group),
sex = factor(Sex)) %>%
age_pyramid(
age_group = "age_group",
split_by = "sex",
proportion = TRUE) +
## only show the x axis label (otherwise repeated in all three plots)
labs(title = "Blood Donors",
x = "N Risk Groups",
y = "% of Sample") +
## make the x axis the same for all plots
scale_y_continuous(breaks = breaks,
limits = limits,
labels = labels)
grid.arrange(p1,p2,p2,ncol=3,nrow=1)